{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-storage.json'\n",
    "\n",
    "from google.cloud import storage\n",
    "client = storage.Client()\n",
    "bucket = client.bucket('mesolitica-public')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import itertools\n",
    "from tqdm import tqdm\n",
    "import json\n",
    "\n",
    "files = glob('results-semi*.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import malaya\n",
    "\n",
    "def cleaning(string):\n",
    "    splitted = malaya.text.function.split_into_sentences(string)\n",
    "    if not len(splitted):\n",
    "        splitted = '. '.join([k.strip() for k in string.split('.') if len(k.strip())])\n",
    "    if splitted[0][0] == '-':\n",
    "        splitted[0] = splitted[0].replace('- ','')\n",
    "    points = [f'{no + 1}. {malaya.text.function.transformer_textcleaning(s)}' for no, s in enumerate(splitted)]\n",
    "    points = ' '.join(points)\n",
    "    return points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 35824/35824 [00:37<00:00, 952.03it/s] \n",
      "100%|██████████| 35824/35824 [00:44<00:00, 813.62it/s] \n",
      "100%|██████████| 35824/35824 [00:44<00:00, 799.95it/s]\n"
     ]
    }
   ],
   "source": [
    "before, after = [], []\n",
    "\n",
    "for file in files:\n",
    "    with open(file) as fopen:\n",
    "        x = json.load(fopen)\n",
    "    merged = list(itertools.chain(*x))\n",
    "    for row in tqdm(merged):\n",
    "        try:\n",
    "            if len(row) != 2:\n",
    "                continue\n",
    "            before.append(cleaning(row[1] + '.'))\n",
    "            after.append(malaya.text.function.transformer_textcleaning(row[0]))\n",
    "        except:\n",
    "            pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(107471, 107471)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(before), len(after)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://raw.githubusercontent.com/huseinzol05/Malay-Dataset/master/summarization/karangan/1.rtf\n",
    "# !pip3 install striprtf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from striprtf.striprtf import rtf_to_text\n",
    "\n",
    "with open('1.rtf') as fopen:\n",
    "    x = fopen.read()\n",
    "\n",
    "results = []\n",
    "splitted = rtf_to_text(x).split('===')\n",
    "for i in range(0, len(splitted), 2):\n",
    "    results.append(splitted[i: i + 2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "remove = ['(AR5)', '(AR1)', '(AR2)', '(AR3)', '(AR4)', '(AA)', '(AH1)', '(AH2)', '(AH3)', '(AC)',\n",
    "'(AH4)', '(AP)', '(A Ksmpln)', '(A Cdngn)', '(A Pndpt)', '(A Pntp)', '(AJ)', '(AH5)']\n",
    "def cleaning_rtf(string):\n",
    "    for r in remove:\n",
    "        string = string.replace(r, '')\n",
    "    return malaya.text.function.transformer_textcleaning(string)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "for row in results:\n",
    "    b = [i for i in row[0].split('\\n') if len(i) > 2]\n",
    "    b = [f'{no + 1}. {i}' for no, i in enumerate(b)]\n",
    "    b = ' '.join(b)\n",
    "    a = cleaning_rtf(row[1])\n",
    "    \n",
    "    before.append(b)\n",
    "    after.append(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "batches = []\n",
    "batch = 20000\n",
    "for i in range(0, len(before), batch):\n",
    "    index = min(i + batch, len(before))\n",
    "    x = before[i: index]\n",
    "    y = after[i: index]\n",
    "    batches.append((x, y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "for i in range(len(batches)):\n",
    "    before = batches[i][0]\n",
    "    after = batches[i][1]\n",
    "    filename = f'bahasa-generator-{i}.tsv'\n",
    "    with tf.io.gfile.GFile(filename, 'w') as outfile:\n",
    "        for i in range(len(before)):\n",
    "            outfile.write('%s\\t%s\\n' % (before[i], after[i]))\n",
    "            \n",
    "    blob = bucket.blob(f't5-data/{filename}')\n",
    "    blob.upload_from_filename(filename)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
